In [4]:
from __future__ import print_function
import numpy as np
import pandas as pd
from collections import OrderedDict #sorting participant df dict before pd.concat()
import matplotlib.pylab as plt
%matplotlib inline
pd.options.display.mpl_style = 'default'
import cPickle as pickle
# Participant IDs excluded from all tasks, grouped by exclusion reason.
non_english_fluent = ['023', '031', '045', '050', '070', '106',]
left_handed = ['042', '088',]
pro_inst_skill = ['026', '037']  # professional-level instrument skill
# Combined exclusion list applied to the merged frame below.
excluded_all_tasks = non_english_fluent + left_handed + pro_inst_skill
In [5]:
def col_matches(df, regex):
    """Return the column labels of `df` that contain a match for `regex`."""
    import re
    pattern = re.compile(regex)
    return [name for name in df.columns if pattern.search(name)]
def compare_transformations(df, columns, functions, **kwargs):
    """Histogram `columns` of `df` raw, then once per transform in `functions`.

    `functions` maps a display name to an element-wise callable; extra
    keyword arguments are forwarded to DataFrame.hist().
    """
    print('raw')
    df[columns].hist(**kwargs)
    plt.show()
    for label in functions:
        print(label)
        df[columns].apply(functions[label]).hist(**kwargs)
        plt.show()
def quickcompare(r, df, size=(15, 7)):
    """Shorthand: histogram all columns matching regex `r` under a
    standard set of candidate normalizing transforms."""
    def inverse(x):
        return 1.0 / x

    transforms = {
        'inverse': inverse,
        'log1p': np.log1p,
        'sqrt': np.sqrt,
        'log': np.log,
    }
    return compare_transformations(df, col_matches(df, r), transforms,
                                   figsize=size)
In [10]:
# using this for inline documentation so that it's clear
# that the printing statement isn't part of the necessary
# transformation code.
def html_print(df):
    """Render `df` as an HTML table for inline notebook display.

    Non-DataFrame input (e.g. a Series) is wrapped in a DataFrame first.
    """
    # Imported locally: HTML is not imported at module level anywhere in
    # the visible source, so calling this would raise NameError otherwise.
    from IPython.display import HTML
    try:
        out = df.to_html()
    except AttributeError:
        out = pd.DataFrame(df).to_html()
    return HTML(out)
def htmljoin(df_list, delimiter=''):
    """Concatenate the HTML renderings of the frames in `df_list`,
    separated by `delimiter`, as one displayable HTML object."""
    # Local import: HTML is not in module scope in the visible source.
    from IPython.display import HTML
    return HTML(delimiter.join([x.to_html() for x in df_list]))
def col_matches(df, regex):
    """Return the column labels of `df` matching `regex`.

    (Re-definition of the helper from an earlier cell, repeated here so
    this cell is self-contained.)
    """
    import re
    matching = []
    for name in df.columns:
        if re.findall(regex, name):
            matching.append(name)
    return matching
def concat_matches(df, *args):
    """Select the columns of `df` matching each regex in `args` and
    return them as a single DataFrame (column-wise concat).

    Returns None when no regexes are given; returns the single selection
    directly when only one regex is given.
    """
    # Guard against an accidental empty regex, which would match everything.
    assert all(len(r) for r in args), "empty regex string passed"
    col_match_lists = [col_matches(df, regex) for regex in args]
    col_set = [df[matches] for matches in col_match_lists]
    if len(col_set) == 0:
        return None
    elif len(col_set) == 1:
        return col_set[0]
    else:
        return pd.concat(col_set, axis=1)
def show_frames(frame_list, delimiter=''):
    """Display several DataFrames as one HTML blob.

    Quirk: `delimiter` does double duty.  When its length equals
    len(frame_list) it is treated as a sequence of per-frame caption
    tags (one per table); otherwise it is a literal separator string
    joined between the rendered tables.
    """
    from IPython.display import HTML
    if len(frame_list) == len(delimiter):
        # Tagged mode: bold caption above each table.
        html_out = ""
        item_template = '<p><strong>{}</strong></p>{}<br>'
        for i, tup in enumerate(zip(frame_list, delimiter)):
            frame = tup[0]
            tag = tup[1]
            html_out += item_template.format(tag, frame.to_html())
        return HTML(html_out)
    else:
        # Separator mode: join the tables with the delimiter string.
        html_out = [df.to_html() for df in frame_list]
        return HTML(delimiter.join(html_out))
In [11]:
# Per-measure pickled DataFrames, keyed by measure name and a dated
# version tag.  NOTE(review): absolute Windows path — breaks on any
# other machine; consider a configurable data directory.
pfilenames = "c:/db_pickles/pickle - dfo-{measure} - {updated}.pickle"
updated_scales = '2014-10-29a'
updated_isip = '2014-10-12b'
updated_sms = '2014-10-20a'
sms_df = pd.read_pickle(pfilenames.format(measure = 'sms', updated = updated_sms))
scales_df = pd.read_pickle(pfilenames.format(measure = 'scales',
                                             updated = updated_scales))
# One pickle per ISIP measure variant (5 vs 8, lag2 vs avgprev4).
isip_measures = ['isip5_lag2',
                 'isip5_avgprev4',
                 'isip8_lag2',
                 'isip8_avgprev4',
                 ]
isip_frames = {m: pd.read_pickle(pfilenames.format(measure = m,
                                                   updated = updated_isip))
               for m in isip_measures}
# Wrap the single frames in dicts so every task output has the same
# {name: frame} shape for merging below.
sms_frames = {'sms': sms_df}
scales_frames = {'scales': scales_df}
In [12]:
from itertools import chain
# Merge all task-output dicts into one OrderedDict so that pd.concat
# below gets a stable, explicit column-group order.
task_output = OrderedDict(chain(scales_frames.items(),
                                sms_frames.items(),
                                isip_frames.items(),
                                ))
In [17]:
# Outer-join all task frames side by side; the dict keys become the
# outer level ('set') of a column MultiIndex.
dfo = pd.concat(task_output.values(),
                axis=1, #defaults
                join='outer',
                keys=task_output.keys(),
                names=['set'],
                )
# Drop excluded participants (index values are participant IDs).
to_drop = set(dfo.index).intersection(excluded_all_tasks)
if to_drop:
    print("Dropping: {}".format(list(to_drop)))
    dfo = dfo.drop(to_drop)
# Sanity check: no excluded participant remains.
assert len(set(dfo.index).intersection(excluded_all_tasks))==0
In [18]:
# Persist the combined frame, versioned by date tag.
full_updated = '2014-10-29a'
pfilenames = "c:/db_pickles/pickle - dfo-{measure} - {updated}.pickle"
output_file = pfilenames.format(measure='full', updated=full_updated)
# Context manager ensures the handle is flushed and closed even on error
# (the original bare open(...) was never closed).
with open(output_file, "wb") as f:
    pickle.dump(dfo, f)
In [22]:
# To use this outside of pandas (R) we'll want to make a flat table
# where the outer index is converted to a prefix for the variable name.
top_level_label = {'scales': 'SCAL',
                   'sms': 'SMSR',
                   'isip8_lag2': 'I8L2',
                   'isip5_lag2': 'I5L2',
                   'isip8_avgprev4': 'I8P4',
                   'isip5_avgprev4': 'I5P4',
                   }
# NOTE(review): .keys() order on a plain dict is arbitrary on pre-3.7
# Pythons, so the column-group order of the flat frame may vary between
# runs — confirm whether downstream consumers care.
dfo_flat_subset = dfo.copy().xs(top_level_label.keys(), axis=1)
# Collapse the (set, variable) column MultiIndex into 'PREFIX_variable'.
dfo_flat_subset.columns = ['_'.join([top_level_label[outer], inner])
                           for (outer, inner) in dfo_flat_subset.columns]
#shorten variable name for export
dfo_flat_subset.columns = [c.replace('avgprev4sq_', "") for c in dfo_flat_subset.columns]
In [23]:
# Export the flat frame both as a pickle and as CSV for external tools.
full_updated = '2014-10-29a'
pfilenames = "c:/db_pickles/pickle - dfo-{measure} - {updated}.{ext}"
output_file_csv = pfilenames.format(measure='flat', updated=full_updated, ext="csv")
output_file_pickle = pfilenames.format(measure='flat', updated=full_updated, ext="pickle")
# Context manager closes the handle (the original open(...) leaked it).
with open(output_file_pickle, "wb") as f:
    pickle.dump(dfo_flat_subset, f)
# Re-code NaNs for external analyses
dfo_coded_nan = dfo_flat_subset.replace(np.nan, 77777)
dfo_coded_nan.to_csv(output_file_csv)
In [24]:
# Spot-check every 15th variable (transposed so variables are rows).
dfo_flat_subset.T[::15]
Out[24]:
In [26]:
# Compare candidate normalizing transforms for the *_DPsd columns.
#quickcompare('DPsd$', df=dfo_flat_subset)
# demonstrates that the inverse transform works really well here (or did before I worked on earlier stages...)
quickcompare('nrm_DPsd', df=dfo_flat_subset) #looks good
quickcompare('psk_DPsd', df=dfo_flat_subset)
In [30]:
# Work with just the questionnaire/scales column group from here on.
scales = dfo.xs('scales', axis=1)
#scales
In [32]:
# Split by sex; describe()[1:3] keeps just the mean and std rows.
females = scales[scales.session_isfemale==True]
males = scales[scales.session_isfemale==False]
# NOTE(review): len(males) is not the cell's last expression, so its
# value is never displayed — harmless but dead.
len(males)
allstats = scales.describe()[1:3]
malestats = males.describe()[1:3]
femalestats = females.describe()[1:3]
allstats.T
Out[32]:
In [ ]:
# Export the ethnicity-related columns for external review.
search= "ethnicity"
cols=concat_matches(scales, search)
cols.to_csv('ethnicity.csv')
In [125]:
# Summary stats for every column whose name contains 'hours'.
concat_matches(scales, 'hours').describe().T
Out[125]:
In [123]:
# Count of participants reporting dance level 0.
len(scales.qmusic_dancelevel[scales.qmusic_dancelevel==0])
Out[123]:
In [ ]:
# Tabulate responses for all yes/no qualifier items (columns ending 'yn').
search= "yn$"
cols=concat_matches(scales, search)
#cols.to_csv('qualifiers_yn.csv')
print(search+'\n------')
cols.apply(pd.value_counts).T
In [26]:
# One-off CSV export of the full combined frame.
dfo.to_csv('csv_dfo_929.csv')
In [6]:
# Correlate the local lag-deviation measure between the 500ms and 800ms
# ISIP conditions.
# NOTE(review): 'isip5_avgprev3'/'isip8_avgprev3' are not among the
# isip_measures loaded above (only *_avgprev4) — presumably this cell
# predates that change and would now raise a KeyError; confirm.
x = dfo.xs('isip5_avgprev3', axis=1).lagdev_avgprev3sq_local
y = dfo.xs('isip8_avgprev3', axis=1).lagdev_avgprev3sq_local
x.corr(y)
Out[6]:
In [7]:
# Scatter the two ISIP conditions against each other.
dfscatter = pd.concat([x, y], keys=['isip500local', 'isip800local'], axis=1)
#dfscatter_inverse = 1.0 / dfscatter
dfscatter.plot(x=0, y=1, kind='scatter')
#print((1 / x).corr(1 / y))
#(1 / dfscatter).plot(x=0, y=1, kind='scatter')
# NOTE(review): DataFrame.sort(columns=...) is the pre-0.17 pandas API
# (removed later in favor of sort_values); the sorted result is only
# displayed, not stored.
dfscatter.sort(columns='isip500local')
Out[7]:
In [10]:
# Column-name search helpers: substring match, and intersection of two
# substring matches.
# NOTE(review): these refer to dfo_flat, which is not defined anywhere
# in the visible source (only dfo_flat_subset is) — likely a stale name
# from an earlier kernel session; confirm before re-running.
lookup = lambda text: [c for c in dfo_flat.columns
                       if text in c]
lookup2 = lambda tup: set(lookup(tup[0])).intersection(set(lookup(tup[1])))
lookup2(('isip5', 'ints_count'))
Out[10]:
In [18]:
# NOTE(review): sms_jitter, sms_ticks and dfo_isip8 are not defined in
# the visible source — this cell depends on state from another kernel
# session and will NameError on a fresh Restart & Run All.
compare = pd.concat([scales, sms_jitter, sms_ticks, dfo_isip8], axis=1)
#scales.append(sms_jitter) #['tick_index'] = sms_ticks_index
#scales.append(sms_ticks)
#scales['jit_index'] = sms_jitter_index
compare
Out[18]: